library(tidyverse)
library(tidyr)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
# Read games_sales_full_cleaned.csv from the working directory into a tibble.
games_sales_full <- read_csv(file = "games_sales_full_cleaned.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
.default = col_double(),
name = [31mcol_character()[39m,
genre = [31mcol_character()[39m,
esrb_rating = [31mcol_character()[39m,
platform = [31mcol_character()[39m,
publisher = [31mcol_character()[39m,
developer = [31mcol_character()[39m,
last_update = [31mcol_character()[39m
)
See spec(...) for full column specifications.
# Read games_sales_full_total_cleaned.csv; this table feeds the top-N subsets.
games_sales_full_total <- read_csv(file = "games_sales_full_total_cleaned.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
.default = col_double(),
name = [31mcol_character()[39m,
genre = [31mcol_character()[39m,
esrb_rating = [31mcol_character()[39m,
platform = [31mcol_character()[39m,
publisher = [31mcol_character()[39m,
developer = [31mcol_character()[39m,
last_update = [31mcol_character()[39m
)
See spec(...) for full column specifications.
Due to the size of the file, I will focus on the top sales for analysis.
# The 200 rows with the highest total_global_sales, printed for inspection.
games_sales_full_total_top_200 <- head(
  arrange(games_sales_full_total, desc(total_global_sales)),
  200
)
games_sales_full_total_top_200

# The 50 rows with the highest total_global_sales.
games_sales_full_total_top_50 <- head(
  arrange(games_sales_full_total, desc(total_global_sales)),
  50
)
games_sales_full_total_top_50

# Per-column summary of the complete table (not just the top subsets).
summary(games_sales_full_total)
X1 rank name genre esrb_rating platform publisher developer critic_score user_score total_shipped_2019
Min. : 1 Min. : 1 Length:55792 Length:55792 Length:55792 Length:55792 Length:55792 Length:55792 Min. : 1.00 Min. : 2.00 Min. : 0.0000
1st Qu.:13949 1st Qu.:13949 Class :character Class :character Class :character Class :character Class :character Class :character 1st Qu.: 6.40 1st Qu.: 7.80 1st Qu.: 0.0000
Median :27896 Median :27896 Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Median : 7.50 Median : 8.50 Median : 0.0000
Mean :27896 Mean :27896 Mean : 7.21 Mean : 8.25 Mean : 0.0618
3rd Qu.:41844 3rd Qu.:41844 3rd Qu.: 8.30 3rd Qu.: 9.10 3rd Qu.: 0.0000
Max. :55792 Max. :55792 Max. :10.00 Max. :10.00 Max. :82.8600
NA's :49256 NA's :55457
global_sales_2019 na_sales_2019 eu_sales_2019 jp_sales_2019 other_sales_2019 year last_update vgchartzscore na_sales_2016 eu_sales_2016 jp_sales_2016 other_sales_2016
Min. : 0.0000 Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00 Min. :1970 Length:55792 Min. :2.60 Min. :0.00 Min. :0.00 Min. :0.00 Min. :0.00
1st Qu.: 0.0000 1st Qu.:0.05 1st Qu.:0.01 1st Qu.:0.02 1st Qu.:0.00 1st Qu.:2000 Class :character 1st Qu.:6.80 1st Qu.:0.03 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.00
Median : 0.0000 Median :0.12 Median :0.04 Median :0.05 Median :0.01 Median :2008 Mode :character Median :7.80 Median :0.09 Median :0.01 Median :0.00 Median :0.01
Mean : 0.1272 Mean :0.28 Mean :0.16 Mean :0.11 Mean :0.04 Mean :2006 Mean :7.43 Mean :0.14 Mean :0.06 Mean :0.01 Mean :0.02
3rd Qu.: 0.0400 3rd Qu.:0.29 3rd Qu.:0.14 3rd Qu.:0.12 3rd Qu.:0.04 3rd Qu.:2011 3rd Qu.:8.50 3rd Qu.:0.18 3rd Qu.:0.05 3rd Qu.:0.00 3rd Qu.:0.02
Max. :20.3200 Max. :9.76 Max. :9.85 Max. :2.69 Max. :3.12 Max. :2020 Max. :9.60 Max. :1.22 Max. :1.23 Max. :0.81 Max. :0.25
NA's :42828 NA's :42603 NA's :48749 NA's :40270 NA's :979 NA's :54993 NA's :55238 NA's :55238 NA's :55238 NA's :55238
global_sales_2016 total_global_sales
Min. :0.000000 Min. : 0.0000
1st Qu.:0.000000 1st Qu.: 0.0000
Median :0.000000 Median : 0.0000
Mean :0.002245 Mean : 0.1912
3rd Qu.:0.000000 3rd Qu.: 0.0700
Max. :2.150000 Max. :82.8600
# Count of top-200 titles per genre, drawn as horizontal bars with a legend.
games_sales_full_total_top_200 %>%
  group_by(genre) %>%
  ggplot(mapping = aes(x = genre, fill = genre)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by genre",
    x = "Genre",
    y = "Number of Games",
    fill = "Genre"
  )

# Same chart with the legend hidden; this is the plot that ggsave() writes.
games_sales_full_total_top_200 %>%
  group_by(genre) %>%
  ggplot(mapping = aes(x = genre, fill = genre)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by genre",
    x = "Genre",
    y = "Number of Games",
    fill = "Genre"
  ) +
  theme(legend.position = "none")
ggsave(filename = "genre_count.png")
Saving 7.29 x 4.51 in image
# Top-200 titles per genre, bars stacked by publisher.
games_sales_full_total_top_200 %>%
  group_by(publisher) %>%
  ggplot(mapping = aes(x = genre, fill = publisher)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by publisher and genre",
    x = "Genre",
    y = "Number of Games",
    fill = "Publisher"
  )

# Top-200 titles per genre, bars stacked by developer.
games_sales_full_total_top_200 %>%
  group_by(developer) %>%
  ggplot(mapping = aes(x = genre, fill = developer)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by developer and genre",
    x = "Genre",
    y = "Number of Games",
    fill = "Developer"
  )

# Smaller subset (50 best-selling rows) used by the developer charts below.
games_sales_full_total_top <- head(
  arrange(games_sales_full_total, desc(total_global_sales)),
  50
)
games_sales_full_total_top
NA
# Top-50 titles per genre, bars stacked by developer; written to disk.
games_sales_full_total_top %>%
  group_by(developer) %>%
  ggplot(mapping = aes(x = genre, fill = developer)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by developer and genre",
    x = "Genre",
    y = "Number of Games",
    fill = "Developer"
  )
ggsave(filename = "genre_developer.png")
Saving 7.29 x 4.51 in image
# Top-50 titles per developer, bars stacked by genre; written to disk.
games_sales_full_total_top %>%
  group_by(developer) %>%
  ggplot(mapping = aes(x = developer, fill = genre)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by developer and genre",
    x = "Developer",
    y = "Number of Games",
    fill = "Genre"
  )
ggsave(filename = "developer_genre.png")
Saving 7.29 x 4.51 in image
# The 20 rows with the highest total_global_sales, printed for inspection.
games_sales_full_total_top_games <- head(
  arrange(games_sales_full_total, desc(total_global_sales)),
  20
)
games_sales_full_total_top_games

# Total global sales per title for those 20 rows, one coloured bar per name.
games_sales_full_total_top_games %>%
  group_by(name) %>%
  ggplot(mapping = aes(x = name, y = total_global_sales, fill = name)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 games sales by name",
    x = "Name",
    y = "Top 20 global sales",
    fill = "Name"
  )
ggsave(filename = "top20_games.png")
Saving 7.29 x 4.51 in image
# Top-20 sales chart with the legend hidden; writes to the same
# top20_games.png file name as the chart with the legend.
games_sales_full_total_top_games %>%
  group_by(name) %>%
  ggplot(mapping = aes(x = name, y = total_global_sales, fill = name)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 games sales by name",
    x = "Name",
    y = "Top 20 global sales",
    fill = "Name"
  ) +
  theme(legend.position = "none")
ggsave(filename = "top20_games.png")
Saving 7.29 x 4.51 in image
# The 40 rows with the highest total_global_sales.
games_sales_full_total_top_games_genre <- head(
  arrange(games_sales_full_total, desc(total_global_sales)),
  40
)
games_sales_full_total_top_games_genre

# A narrower view (five columns) of the 50 best-selling rows.
games_sales_full_total_top_games_genre_select <- games_sales_full_total %>%
  arrange(desc(total_global_sales)) %>%
  select(rank, name, genre, platform, total_global_sales) %>%
  head(50)
games_sales_full_total_top_games_genre_select

# Total global sales per title for the top-40 rows, coloured by genre.
games_sales_full_total_top_games_genre %>%
  group_by(name) %>%
  ggplot(mapping = aes(x = name, y = total_global_sales, fill = genre)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 40 games sales by name and genre",
    x = "Name",
    y = "Top 40 global sales",
    fill = "Genre"
  )
ggsave(filename = "top40_games_name_genre.png")
Saving 7.29 x 4.51 in image
k-means
#games_sales_full_total_top_games_genre %>%
# unnest(cols = c(augmented)) %>%
#filter(k == 2) %>%
# ggplot(aes(x = murder, y = assault, colour = .cluster, label = .rownames)) +
# geom_point(aes(color = .cluster)) +
# geom_text(hjust = 0, vjust = - 0.5, size = 3)
# Top-200 titles per genre, bars stacked by platform; written to disk.
games_sales_full_total_top_200 %>%
  group_by(developer) %>%
  ggplot(mapping = aes(x = genre, fill = platform)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by platform and genre",
    x = "Genre",
    y = "Number of Games",
    fill = "Platform"
  )
ggsave(filename = "count_platform_genre.png")
Saving 7.29 x 4.51 in image
# Top-200 titles per platform, bars stacked by genre; written to disk.
games_sales_full_total_top_200 %>%
  group_by(developer) %>%
  ggplot(mapping = aes(x = platform, fill = genre)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by genre and platform",
    x = "Platform",
    y = "Number of Games",
    fill = "Genre"
  )
ggsave(filename = "count_genre_platform.png")
Saving 7.29 x 4.51 in image
Already used as graph 1
# Count of top-200 titles per genre (with legend); written to genre_count.png.
games_sales_full_total_top_200 %>%
  group_by(genre) %>%
  ggplot(mapping = aes(x = genre, fill = genre)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by genre",
    x = "Genre",
    y = "Number of Games",
    fill = "Genre"
  )
ggsave(filename = "genre_count.png")
Saving 7.29 x 4.51 in image
Games by platform
# Count of top-200 titles per platform, drawn with a legend.
games_sales_full_total_top_200 %>%
  group_by(genre) %>%
  ggplot(mapping = aes(x = platform, fill = platform)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by Platform",
    x = "Platform",
    y = "Number of Games",
    fill = "Platform"
  )

# Same chart with the legend hidden; this is the plot that ggsave() writes.
games_sales_full_total_top_200 %>%
  group_by(genre) %>%
  ggplot(mapping = aes(x = platform, fill = platform)) +
  geom_bar() +
  coord_flip() +
  labs(
    title = "Games count by Platform",
    x = "Platform",
    y = "Number of Games",
    fill = "Platform"
  ) +
  theme(legend.position = "none")
ggsave(filename = "platform_count.png")
Saving 7.29 x 4.51 in image
Games by region - global market
# Regional breakdown of sales from the long-format table.
#
# The table is read here because this chart appears before the read_csv()
# call that otherwise creates `games_sales_full_longer`; in a fresh session
# the object would not exist yet. The four *_sales_2016 columns are forced to
# double so readr does not guess them as logical from leading empty rows.
games_sales_full_longer <- read_csv(
  "games_sales_full_longer.csv",
  col_types = cols(
    na_sales_2016 = col_double(),
    eu_sales_2016 = col_double(),
    jp_sales_2016 = col_double(),
    other_sales_2016 = col_double()
  )
)

# geom_col() stacks (sums) the sales of every row in a region, which is the
# "breakdown" the title promises; a line geom on a discrete axis only draws a
# min-to-max segment per region. No fill aesthetic is mapped, so no fill
# label is set.
games_sales_full_longer %>%
  ggplot(aes(x = sales_region, y = sales_region_millions)) +
  geom_col() +
  labs(
    title = "Geographic breakdown of Global Sales",
    x = "Specific global market",
    y = "Sales (millions)"
  ) +
  coord_flip()
# Distribution of total global sales among the top-200 titles, coloured by
# genre. total_global_sales is continuous, so the values are binned with
# geom_histogram(); geom_bar() would count each exact sales figure separately.
# The y axis is a count of games, and nothing here is regional, so the title
# and y label describe the distribution rather than a geographic breakdown.
games_sales_full_total_top_200 %>%
  ggplot(aes(x = total_global_sales, fill = genre)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Distribution of total global sales by genre",
    x = "Sales (millions)",
    y = "Number of Games",
    fill = "Genre"
  ) +
  coord_flip()

# Same chart with the legend hidden.
games_sales_full_total_top_200 %>%
  ggplot(aes(x = total_global_sales, fill = genre)) +
  geom_histogram(bins = 30) +
  theme(legend.position = "none") +
  labs(
    title = "Distribution of total global sales by genre",
    x = "Sales (millions)",
    y = "Number of Games",
    fill = "Genre"
  ) +
  coord_flip()
# Scatter each 2019 regional sales figure against total global sales, one
# facet per region, with a straight-line (lm) fit and no confidence band.
# The regional columns are selected by name instead of by position (13:16),
# so the reshape keeps working if columns are added or reordered upstream.
games_sales_full_total_top_200 %>%
  pivot_longer(
    cols = c(na_sales_2019, eu_sales_2019, jp_sales_2019, other_sales_2019),
    names_to = "id",
    values_to = "value"
  ) %>%
  ggplot(aes(x = total_global_sales, y = value)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  facet_wrap(~id)
# Read the long-format sales table. The four *_sales_2016 columns hold
# decimals (e.g. 1.21, 0.75) but were guessed as logical, which turned every
# real value into NA with a parsing failure; their type is pinned to double.
# All other columns are still guessed by readr.
games_sales_full_longer <- read_csv(
  "games_sales_full_longer.csv",
  col_types = cols(
    na_sales_2016 = col_double(),
    eu_sales_2016 = col_double(),
    jp_sales_2016 = col_double(),
    other_sales_2016 = col_double()
  )
)
Missing column names filled in: 'X1' [1]Duplicated column names deduplicated: 'X1' => 'X1_1' [2]Parsed with column specification:
cols(
.default = col_double(),
name = [31mcol_character()[39m,
genre = [31mcol_character()[39m,
esrb_rating = [31mcol_character()[39m,
platform = [31mcol_character()[39m,
publisher = [31mcol_character()[39m,
developer = [31mcol_character()[39m,
last_update = [31mcol_character()[39m,
na_sales_2016 = [33mcol_logical()[39m,
eu_sales_2016 = [33mcol_logical()[39m,
jp_sales_2016 = [33mcol_logical()[39m,
other_sales_2016 = [33mcol_logical()[39m,
sales_region = [31mcol_character()[39m
)
See spec(...) for full column specifications.
6100 parsing failures.
row col expected actual file
4441 na_sales_2016 1/0/T/F/TRUE/FALSE 1.21 'games_sales_full_longer.csv'
4441 eu_sales_2016 1/0/T/F/TRUE/FALSE 0.75 'games_sales_full_longer.csv'
4441 other_sales_2016 1/0/T/F/TRUE/FALSE 0.19 'games_sales_full_longer.csv'
4442 na_sales_2016 1/0/T/F/TRUE/FALSE 1.21 'games_sales_full_longer.csv'
4442 eu_sales_2016 1/0/T/F/TRUE/FALSE 0.75 'games_sales_full_longer.csv'
.... ................ .................. ...... .............................
See problems(...) for more details.
# Read the combined long-format table. Eight columns were guessed as logical,
# so their real values (e.g. esrb_rating "E", total_shipped 82.86, year 2006)
# were dropped as parsing failures; their types are pinned explicitly.
# NOTE(review): last_update is taken to be text, matching how the *_cleaned
# files parse it -- confirm against the CSV.
games_sales_full_bind_total_longer <- read_csv(
  "games_sales_full_bind_total_longer.csv",
  col_types = cols(
    rank = col_double(),
    esrb_rating = col_character(),
    user_score = col_double(),
    total_shipped = col_double(),
    year = col_double(),
    last_update = col_character(),
    vgchartzscore = col_double(),
    total_global_sales = col_double()
  )
)
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
X1 = [32mcol_double()[39m,
name = [31mcol_character()[39m,
platform = [31mcol_character()[39m,
genre = [31mcol_character()[39m,
publisher = [31mcol_character()[39m,
global_sales = [32mcol_double()[39m,
critic_score = [32mcol_double()[39m,
developer = [31mcol_character()[39m,
year_data = [32mcol_double()[39m,
rank = [33mcol_logical()[39m,
esrb_rating = [33mcol_logical()[39m,
user_score = [33mcol_logical()[39m,
total_shipped = [33mcol_logical()[39m,
year = [33mcol_logical()[39m,
last_update = [33mcol_logical()[39m,
vgchartzscore = [33mcol_logical()[39m,
total_global_sales = [33mcol_logical()[39m,
sales_region = [31mcol_character()[39m,
sales_region_millions = [32mcol_double()[39m
)
560728 parsing failures.
row col expected actual file
66877 esrb_rating 1/0/T/F/TRUE/FALSE E 'games_sales_full_bind_total_longer.csv'
66877 total_shipped 1/0/T/F/TRUE/FALSE 82.86 'games_sales_full_bind_total_longer.csv'
66877 year 1/0/T/F/TRUE/FALSE 2006 'games_sales_full_bind_total_longer.csv'
66878 esrb_rating 1/0/T/F/TRUE/FALSE E 'games_sales_full_bind_total_longer.csv'
66878 total_shipped 1/0/T/F/TRUE/FALSE 82.86 'games_sales_full_bind_total_longer.csv'
..... ............. .................. ...... ........................................
See problems(...) for more details.
# Total sales (millions) per region and data year, ignoring missing values.
# .groups = "drop" returns an ungrouped tibble, so no grouping lingers on the
# result and summarise() does not emit its regrouping message.
games_sales_full_bind_total_longer %>%
  group_by(sales_region, year_data) %>%
  summarise(
    Total = sum(sales_region_millions, na.rm = TRUE),
    .groups = "drop"
  )
`summarise()` regrouping output by 'sales_region' (override with `.groups` argument)
# Total sales per region, one facet per data year.
# The summary is ungrouped before plotting, geom_col() replaces the equivalent
# geom_bar(stat = "identity"), and the title spelling is corrected.
games_sales_full_bind_total_longer %>%
  group_by(sales_region, year_data) %>%
  summarise(
    Total = sum(sales_region_millions, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = sales_region, y = Total)) +
  geom_col() +
  labs(
    title = "Total global sales by region",
    x = "Global market region",
    y = "Total sales (millions)"
  ) +
  facet_wrap(~year_data)
`summarise()` regrouping output by 'sales_region' (override with `.groups` argument)
# Box plot of per-row sales for each sales region. The grouping variable is
# sales_region, so the title and axis label name the region, not the genre.
ggplot(
  games_sales_full_bind_total_longer,
  aes(x = sales_region, y = sales_region_millions)
) +
  geom_boxplot() +
  labs(
    title = "Distribution of sales by region",
    x = "Global market region",
    y = "Sales (millions)"
  ) +
  coord_flip()
# Sales per region, one facet per data year.
# NOTE(review): games_sales_full_bind_total_longer_region is not created in
# the visible part of this file -- confirm it exists before this chunk runs.
# fill is mapped to sales_region so that scale_fill_brewer() has an aesthetic
# to act on (previously it had no effect); the legend is hidden because the
# axis already names each region. The title spelling is corrected.
games_sales_full_bind_total_longer_region %>%
  ggplot(aes(x = sales_region, y = sales_region_millions, fill = sales_region)) +
  geom_col() +
  labs(
    title = "Total global sales by region",
    x = "Global market region",
    y = "Total sales (millions)"
  ) +
  scale_fill_brewer() +
  theme(legend.position = "none") +
  facet_wrap(~year_data)
# Read the combined (wide) table. Eight columns were guessed as logical, so
# their real values (e.g. esrb_rating "E", total_shipped 82.86, rank 2) were
# dropped as parsing failures; their types are pinned explicitly.
# NOTE(review): last_update is taken to be text, matching how the *_cleaned
# files parse it -- confirm against the CSV.
games_sales_full_bind_total <- read_csv(
  "games_sales_full_bind_total.csv",
  col_types = cols(
    rank = col_double(),
    esrb_rating = col_character(),
    user_score = col_double(),
    total_shipped = col_double(),
    year = col_double(),
    last_update = col_character(),
    vgchartzscore = col_double(),
    total_global_sales = col_double()
  )
)
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
.default = col_double(),
name = [31mcol_character()[39m,
platform = [31mcol_character()[39m,
genre = [31mcol_character()[39m,
publisher = [31mcol_character()[39m,
developer = [31mcol_character()[39m,
rank = [33mcol_logical()[39m,
esrb_rating = [33mcol_logical()[39m,
user_score = [33mcol_logical()[39m,
total_shipped = [33mcol_logical()[39m,
year = [33mcol_logical()[39m,
last_update = [33mcol_logical()[39m,
vgchartzscore = [33mcol_logical()[39m,
total_global_sales = [33mcol_logical()[39m
)
See spec(...) for full column specifications.
140182 parsing failures.
row col expected actual file
16720 esrb_rating 1/0/T/F/TRUE/FALSE E 'games_sales_full_bind_total.csv'
16720 total_shipped 1/0/T/F/TRUE/FALSE 82.86 'games_sales_full_bind_total.csv'
16720 year 1/0/T/F/TRUE/FALSE 2006 'games_sales_full_bind_total.csv'
16721 rank 1/0/T/F/TRUE/FALSE 2 'games_sales_full_bind_total.csv'
16721 total_shipped 1/0/T/F/TRUE/FALSE 40.24 'games_sales_full_bind_total.csv'
..... ............. .................. ...... .................................
See problems(...) for more details.
# Stacked sales per region, coloured by genre; written to disk.
# There is no coord_flip() here, so x is the region axis and y is the sales
# axis; the axis labels are assigned accordingly (they were swapped).
games_sales_full_bind_total_longer %>%
  ggplot(aes(x = sales_region, y = sales_region_millions, fill = genre)) +
  geom_col() +
  labs(
    title = "Total global sales by region",
    x = "Global market region",
    y = "Total sales"
  )
ggsave("sales_region_genre.png")
Saving 7.29 x 4.51 in image
XXXXXXXX WRONG DS
# Stacked sales per genre, coloured by sales region; written to disk.
# The x aesthetic is genre, so its label says "Genre" (it previously read
# "Global market region", which is the fill variable).
games_sales_full_bind_total_longer %>%
  ggplot(aes(x = genre, y = sales_region_millions, fill = sales_region)) +
  geom_col() +
  labs(
    title = "Total global sales by genre and region",
    x = "Genre",
    y = "Total sales"
  ) +
  coord_flip()
ggsave("sales_genre_region.png")
Saving 7.29 x 4.51 in image
Correct DS
# Summed sales per genre, coloured by sales region.
# Grouping uses only the categorical keys: including sales_region_millions in
# group_by() would make each distinct sales figure its own group. The chart
# plots the summed Total rather than the raw column, and the x label names
# the genre axis.
games_sales_full_bind_total_longer %>%
  group_by(sales_region, year_data, genre) %>%
  summarise(
    Total = sum(sales_region_millions, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = genre, y = Total, fill = sales_region)) +
  geom_col() +
  labs(
    title = "Total global sales by genre and region",
    x = "Genre",
    y = "Total sales"
  ) +
  coord_flip()
`summarise()` regrouping output by 'sales_region', 'year_data', 'genre' (override with `.groups` argument)
# Write the most recent plot to sales_genre_region.png at the default size.
ggsave(filename = "sales_genre_region.png", plot = last_plot())
Saving 7.29 x 4.51 in image
Games mean (max/min) sales
# Box plot of total global sales per genre for the top-200 rows; written to
# disk.
games_sales_full_total_top_200 %>%
  ggplot(mapping = aes(x = genre, y = total_global_sales)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    title = "Distribution of sales by genre",
    x = "Genre",
    y = "Sales (millions)"
  )
ggsave(filename = "distribution_sales_by_genre.png")
Saving 7.29 x 4.51 in image
# Box plot of total global sales per platform for the top-200 rows; written
# to disk.
games_sales_full_total_top_200 %>%
  ggplot(mapping = aes(x = platform, y = total_global_sales)) +
  geom_boxplot() +
  coord_flip() +
  labs(
    title = "Distribution of sales by platform",
    x = "Platform",
    y = "Sales (millions)"
  )
ggsave(filename = "distribution_sales_by_platform.png")
Saving 7.29 x 4.51 in image
shipped / critic score